1 Setup

setwd("/mnt/picea/projects/aspseq/jfelten/T89-Laccaria-bicolor/Salmon")
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
source("~/Git/UPSCb/src/R/plot.multidensity.R")
source("~/Git/UPSCb/src/R/featureSelection.R")
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/UPSCb/projects/T89-Laccaria-bicolor/doc/Samples.csv") %>% 
  mutate(Time=factor(Time)) %>% 
  mutate(Experiment=factor(Experiment))
## Parsed with column specification:
## cols(
##   SciLifeID = col_character(),
##   SampleName = col_double(),
##   Time = col_double(),
##   Experiment = col_character(),
##   Replicate = col_character()
## )

tx2gene translation

Potra.tx2gene <- read_delim("/mnt/picea/storage/reference/Populus-tremula/v1.1/annotation/tx2gene.tsv",
                            "\t",col_names=c("TXID","GENE"))
## Parsed with column specification:
## cols(
##   TXID = col_character(),
##   GENE = col_character()
## )
Potri.tx2gene <- read_delim("/mnt/picea/storage/reference/Populus-trichocarpa/v3.0/annotation/tx2gene.tsv",
                            "\t",col_names=c("TXID","GENE"))
## Parsed with column specification:
## cols(
##   TXID = col_character(),
##   GENE = col_character()
## )

2 Laccaria bicolor data

2.1 Raw data

lb.filelist <- list.files("Lacbi2", 
                    recursive = TRUE, 
                    pattern = "quant.sf",
                    full.names = TRUE)

Select the samples containing fungi

stopifnot(all(str_which(basename(lb.filelist),samples$SciLifeID) == 1:nrow(samples)))
names(lb.filelist) <- samples$SciLifeID
lb.filelist <- lb.filelist[samples$Experiment %in% c("ECM","FLM")]

Read the expression at the gene level (there is one transcript per gene)

lb.g <- suppressMessages(tximport(files = lb.filelist, 
                                type = "salmon",txOut=TRUE))

counts <- round(lb.g$counts)

2.2 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "24.6% percent (5617) of 22822 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(names(lb.filelist),samples$SciLifeID),]),
       aes(x,y,col=Experiment,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 5617 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$SciLifeID)])

Color by Experiment

ggplot(dat,aes(x=values,group=ind,col=Experiment)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 294928 rows containing non-finite values (stat_density).

Color by Time

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 294928 rows containing non-finite values (stat_density).

2.3 Export

dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/Lacbi-raw-unormalised-gene-expression_data.csv")

2.4 Data normalisation

2.4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples[s.sel,],
  design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="../analysis/salmon/Lacbi-dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is relatively variable (50 to 300 %)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11915_101 P11915_102 P11915_103 P11915_104 P11915_105 P11915_106
0.9883 3.221 3.189 1.53 1.51 1.907
Table continues below
P11915_107 P11915_112 P11915_113 P11915_114 P11915_115 P11915_116
1.939 1.641 0.5662 0.5859 1.38 1.465
Table continues below
P11915_117 P11915_118 P11915_123 P11915_124 P11915_125 P11915_126
1.102 0.7441 0.8141 0.4824 1.64 0.5933
Table continues below
P11915_127 P11915_128 P11915_129 P11915_134 P11915_135 P11915_136
0.5453 0.735 0.6828 0.9994 1.221 0.8101
Table continues below
P11915_137 P11915_138 P11915_139 P11915_140 P11915_145 P11915_146
0.5967 1.341 0.7234 1.153 1.797 0.8721
P11915_147 P11915_148 P11915_149 P11915_150 P11915_151
1.078 0.6854 0.6586 0.4662 0.8963
boxplot(sizes, main="Sequencing libraries size factor")

2.5 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked very well, the data is nearly homoscesdastic

meanSdPlot(vst[rowSums(counts)>0,])

2.6 QC on the normalised data

2.6.1 PCA

pc <- prcomp(t(vst))
  
percent <- round(summary(pc)$importance[2,]*100)

2.6.2 3 first dimensions

This looks interesting as the sample separate clearly both by Experiment and Time in the first 2 components.

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)[s.sel]],
              pch=c(17,19)[as.integer(samples$Experiment)[s.sel]-1])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19),
       legend=levels(samples$Experiment)[-1])

par(mar=mar)

2.6.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples[s.sel,])

ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

2.6.4 Heatmap

Filter for noise A cutoff at a VST value of 1 leaves about 15000 genes, which is adequate for the QA

conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
                   conditions=conds,
                   nrep=3)

  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples.
heatmap.2(t(scale(t(vst[sels[[2]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]

The subset is enriched for higher expression values

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering remains the same even for the most variable genes

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]

The subset is enriched for higher expression values, with a strinkingly normal distribution

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes shows a separation by experiment and time point

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

2.7 Conclusion

The quality of the data is good. The PCA shows that the samples cluster by experiment and time, however, the heatmap shows a clustering that correlates with the mapping rates, i.e. the mixed amount of reads originating from either organism. The final heatmap seem to indicate that this effect is neglectable albeit confounded.

3 T89 ( Populus tremula x Populus tremuloides ) data

3.1 Raw data

pt.filelist <- list.files("Potra01", 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Select the samples containing fungi

stopifnot(all(str_which(basename(pt.filelist),samples$SciLifeID) == 1:nrow(samples)))
names(pt.filelist) <- samples$SciLifeID
pt.filelist <- pt.filelist[samples$Experiment %in% c("Cont","ECM")]

Read the expression at the gene level (there is one transcript per gene)

pt.g <- suppressMessages(tximport(files = pt.filelist, 
                                  type = "salmon",
                                  tx2gene=Potra.tx2gene))

counts <- round(pt.g$counts)

3.2 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "13.8% percent (4684) of 34043 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(names(pt.filelist),samples$SciLifeID),]),
       aes(x,y,col=Experiment,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 4684 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$SciLifeID)])

Color by Experiment

ggplot(dat,aes(x=values,group=ind,col=Experiment)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 486035 rows containing non-finite values (stat_density).

Color by Time

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 486035 rows containing non-finite values (stat_density).

3.3 Export

dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/Potra-raw-unormalised-gene-expression_data.csv")

3.4 Data normalisation

3.4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples[s.sel,],
  design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="../analysis/salmon/Potra-dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is highly variable (20 to 500 %)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11915_104 P11915_105 P11915_106 P11915_107 P11915_108 P11915_109
0.1408 0.3295 0.2518 0.4736 3.867 2.97
Table continues below
P11915_110 P11915_111 P11915_115 P11915_116 P11915_117 P11915_118
3.018 1.813 0.2348 0.356 0.1237 0.1678
Table continues below
P11915_119 P11915_120 P11915_121 P11915_122 P11915_126 P11915_127
1.918 2.647 1.968 2.015 1.306 0.2046
Table continues below
P11915_128 P11915_129 P11915_130 P11915_131 P11915_132 P11915_133
0.5862 0.4512 2.514 2.044 3.149 2.801
Table continues below
P11915_137 P11915_138 P11915_139 P11915_140 P11915_141 P11915_142
1.614 0.7794 0.1832 0.3401 2.288 3.813
Table continues below
P11915_143 P11915_144 P11915_148 P11915_149 P11915_150 P11915_151
4.682 1.432 1.501 0.5248 1.58 0.8726
P11915_152 P11915_153 P11915_154 P11915_155
1.795 1.866 2.409 1.501
boxplot(sizes, main="Sequencing libraries size factor")

3.5 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(counts)>0,])

3.6 QC on the normalised data

3.6.1 PCA

pc <- prcomp(t(vst))

percent <- round(summary(pc)$importance[2,]*100)

3.6.2 3 first dimensions

This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)[s.sel]],
              pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19),
       legend=levels(samples$Experiment)[-3])

par(mar=mar)

3.6.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples[s.sel,])

ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

3.6.4 Heatmap

Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA

conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)

  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples, but it is not as striking as for Laccaria bicolor. There is clustering by time, especially for the earlier time points. The later ECM time points are closer to the earlier ECM samples, while the later control cluster together, apart from some outliers.
heatmap.2(t(scale(t(vst[sels[[3]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]

The subset is enriched for higher expression values

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering shows a better clustering by time and experiment, even though there are still outliers

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]

The subset is enriched for higher expression values, with a strinkingly normal distribution

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes is very similar to the overall profile

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

3.7 Conclusion

The same conclusion as for Laccaria bicolor apply, despite the difference in the heatmap behaviour that may indicate some bias due to the organisms.

4 Populus trichocarpa data

4.1 Raw data

pt.filelist <- list.files("Potri03", 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Select the samples containing fungi

stopifnot(all(str_which(basename(pt.filelist),samples$SciLifeID) == 1:nrow(samples)))
names(pt.filelist) <- samples$SciLifeID
pt.filelist <- pt.filelist[samples$Experiment %in% c("Cont","ECM")]

Read the expression at the gene level (there is one transcript per gene)

pt.g <- suppressMessages(tximport(files = pt.filelist, 
                                  type = "salmon",
                                  tx2gene=Potri.tx2gene))

counts <- round(pt.g$counts)

4.2 Raw Data QC analysis

Check how many genes are never expressed

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "19.5% percent (8064) of 41270 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(names(pt.filelist),samples$SciLifeID),]),
       aes(x,y,col=Experiment,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 8064 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$SciLifeID)])

Color by Experiment

ggplot(dat,aes(x=values,group=ind,col=Experiment)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 744113 rows containing non-finite values (stat_density).

Color by Time

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 744113 rows containing non-finite values (stat_density).

4.3 Export

dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/Potri-raw-unormalised-gene-expression_data.csv")

4.4 Data normalisation

4.4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples[s.sel,],
  design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="../analysis/salmon/Potri-dds.rda")

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is highly variable (20 to 500 %)

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11915_104 P11915_105 P11915_106 P11915_107 P11915_108 P11915_109
0.1389 0.3277 0.2512 0.4697 3.833 2.954
Table continues below
P11915_110 P11915_111 P11915_115 P11915_116 P11915_117 P11915_118
2.999 1.803 0.2367 0.3513 0.125 0.1719
Table continues below
P11915_119 P11915_120 P11915_121 P11915_122 P11915_126 P11915_127
1.905 2.672 1.986 2.032 1.304 0.2037
Table continues below
P11915_128 P11915_129 P11915_130 P11915_131 P11915_132 P11915_133
0.5915 0.4549 2.5 2.056 3.142 2.786
Table continues below
P11915_137 P11915_138 P11915_139 P11915_140 P11915_141 P11915_142
1.649 0.793 0.1836 0.337 2.255 3.818
Table continues below
P11915_143 P11915_144 P11915_148 P11915_149 P11915_150 P11915_151
4.701 1.414 1.504 0.5314 1.572 0.8793
P11915_152 P11915_153 P11915_154 P11915_155
1.786 1.852 2.44 1.491
boxplot(sizes, main="Sequencing libraries size factor")

4.5 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked adequately

meanSdPlot(vst[rowSums(counts)>0,])

4.6 QC on the normalised data

4.6.1 PCA

pc <- prcomp(t(vst))

percent <- round(summary(pc)$importance[2,]*100)

4.6.2 3 first dimensions

This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)[s.sel]],
              pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19),
       legend=levels(samples$Experiment)[-3])

par(mar=mar)

4.6.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples[s.sel,])

ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

4.6.4 Heatmap

Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA

conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)

  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples, but it is not as striking as for Laccaria bicolor. There is clustering by time, especially for the earlier time points. The later ECM time points are closer to the earlier ECM samples, while the later control cluster together, apart from some outliers.
heatmap.2(t(scale(t(vst[sels[[3]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]

The subset is enriched for higher expression values

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering shows a better clustering by time and experiment, even though there are still outliers

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]

The subset is enriched for higher expression values, with a strinkingly normal distribution

ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
            total=rowMeans(vst[sels[[2]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes is very similar to the overall profile

heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

4.7 Conclusion

The same conclusion as for Laccaria bicolor apply, despite the difference in the heatmap behaviour that may indicate some bias due to the organisms.

5 Session Info

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] vsn_3.52.0                  tximport_1.12.3            
##  [3] forcats_0.4.0               stringr_1.4.0              
##  [5] dplyr_0.8.3                 purrr_0.3.2                
##  [7] readr_1.3.1                 tidyr_0.8.3                
##  [9] tibble_2.1.3                tidyverse_1.2.1            
## [11] scatterplot3d_0.3-41        RColorBrewer_1.1-2         
## [13] pander_0.6.3                LSD_4.0-0                  
## [15] hyperSpec_0.99-20180627     ggplot2_3.2.1              
## [17] lattice_0.20-38             gplots_3.0.1.1             
## [19] DESeq2_1.24.0               SummarizedExperiment_1.14.1
## [21] DelayedArray_0.10.0         BiocParallel_1.18.1        
## [23] matrixStats_0.54.0          Biobase_2.44.0             
## [25] GenomicRanges_1.36.0        GenomeInfoDb_1.20.0        
## [27] IRanges_2.18.2              S4Vectors_0.22.0           
## [29] BiocGenerics_0.30.0         data.table_1.12.2          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1       htmlTable_1.13.1       XVector_0.24.0        
##  [4] base64enc_0.1-3        rstudioapi_0.10        hexbin_1.27.3         
##  [7] affyio_1.54.0          bit64_0.9-7            AnnotationDbi_1.46.1  
## [10] lubridate_1.7.4        xml2_1.2.2             splines_3.6.1         
## [13] geneplotter_1.62.0     knitr_1.24             zeallot_0.1.0         
## [16] Formula_1.2-3          jsonlite_1.6           broom_0.5.2           
## [19] annotate_1.62.0        cluster_2.1.0          BiocManager_1.30.4    
## [22] compiler_3.6.1         httr_1.4.1             backports_1.1.4       
## [25] assertthat_0.2.1       Matrix_1.2-17          lazyeval_0.2.2        
## [28] limma_3.40.6           cli_1.1.0              acepack_1.4.1         
## [31] htmltools_0.3.6        tools_3.6.1            gtable_0.3.0          
## [34] glue_1.3.1             GenomeInfoDbData_1.2.1 affy_1.62.0           
## [37] reshape2_1.4.3         Rcpp_1.0.2             cellranger_1.1.0      
## [40] vctrs_0.2.0            preprocessCore_1.46.0  gdata_2.18.0          
## [43] nlme_3.1-141           xfun_0.9               testthat_2.2.1        
## [46] rvest_0.3.4            gtools_3.8.1           XML_3.98-1.20         
## [49] zlibbioc_1.30.0        scales_1.0.0           hms_0.5.1             
## [52] yaml_2.2.0             memoise_1.1.0          gridExtra_2.3         
## [55] rpart_4.1-15           latticeExtra_0.6-28    stringi_1.4.3         
## [58] RSQLite_2.1.2          highr_0.8              genefilter_1.66.0     
## [61] checkmate_1.9.4        caTools_1.17.1.2       rlang_0.4.0           
## [64] pkgconfig_2.0.2        bitops_1.0-6           evaluate_0.14         
## [67] labeling_0.3           htmlwidgets_1.3        bit_1.1-14            
## [70] tidyselect_0.2.5       plyr_1.8.4             magrittr_1.5          
## [73] R6_2.4.0               generics_0.0.2         Hmisc_4.2-0           
## [76] DBI_1.0.0              pillar_1.4.2           haven_2.1.1           
## [79] foreign_0.8-72         withr_2.1.2            survival_2.44-1.1     
## [82] RCurl_1.95-4.12        nnet_7.3-12            modelr_0.1.5          
## [85] crayon_1.3.4           KernSmooth_2.23-15     rmarkdown_1.15        
## [88] locfit_1.5-9.1         readxl_1.3.1           blob_1.2.0            
## [91] digest_0.6.20          xtable_1.8-4           munsell_0.5.0